### Initialize workspace.
rm(list = ls(all = TRUE))
setwd("~/Downloads/hbg_replication")

# Load required packages
library(plyr)
library(car)
library(anesrake)

# Load relevant functions.
source("scripts/helper_functions.R")

## Load data.
# Load TPNW experimental data.
tpnw <- read.csv("data/tpnw_raw.csv", stringsAsFactors = FALSE, row.names = 1)

# Load original income question data.
orig_inc <- read.csv("data/tpnw_orig_income.csv", stringsAsFactors = FALSE, 
                     row.names = 1)

# Load YouGov data (including covariates and awareness question).
aware <- read.csv("data/tpnw_aware_raw.csv", stringsAsFactors = FALSE, 
                  row.names = 1)

### Clean TPNW data.
## Clean data.
# Remove first two (extraneous) rows.
tpnw <- tpnw[-c(1, 2),]
orig_inc <- orig_inc[-c(1, 2),]

# Remove respondents who did not consent.
tpnw <- tpnw[tpnw$consent == "1",]
orig_inc <- orig_inc[orig_inc$consent == "1",]

# Coalesce income variables.
orig_inc <- within(orig_inc, {
	income <- as.numeric(income)
	income <- ifelse(income < 1000, NA, income)
	income <- ifelse(income < 15000, 1, income)
	income <- ifelse(income >= 15000 & income < 25000, 2, income)
	income <- ifelse(income >= 25000 & income < 50000, 3, income)
	income <- ifelse(income >= 50000 & income < 75000, 4, income)
	income <- ifelse(income >= 75000 & income < 100000, 5, income)
	income <- ifelse(income >= 100000 & income < 150000, 6, income)
	income <- ifelse(income >= 150000 & income < 200000, 7, income)
	income <- ifelse(income >= 200000 & income < 250000, 8, income)
	income <- ifelse(income >= 250000 & income < 500000, 9, income)
	income <- ifelse(income >= 500000 & income < 1000000, 10, income)
	income <- ifelse(income >= 1000000, 11, income)
})
orig_inc <- data.frame(pid = orig_inc$pid, income_old = orig_inc$income)
tpnw <- plyr::join(tpnw, orig_inc, by = "pid", type = "left")
tpnw <- within(tpnw, {
			income <- coalesce(as.numeric(income), as.numeric(income_old))
		})

# Note meta variables.
meta <- c("consent", "confirmation_code", "new_income_q")

# Note Qualtrics variables.
qualtrics_vars <- c("StartDate", "EndDate", "Status", "Progress", 
                    "Duration..in.seconds.", "Finished", "RecordedDate", 
                    "DistributionChannel", "UserLanguage")

# Note Dynata variables.
dynata_vars <- c("pid", "psid")

# Note non-numeric variables.
char_vars <- c(qualtrics_vars, dynata_vars,
			   c("ResponseId"), names(tpnw)[grep("text", tolower(names(tpnw)))])
char_cols <- which(names(tpnw) %in% char_vars)

# Numericize other variables
tpnw <- data.frame(apply(tpnw[, -char_cols], 2, as.numeric), tpnw[char_cols])

tpnw_atts <- which(names(tpnw) %in% c("danger", "peace", "safe", "use_unaccept", 
									  "always_cheat", "cannot_elim", "slow_reduc"))
names(tpnw)[tpnw_atts] <- paste("tpnw_atts", names(tpnw)[tpnw_atts], sep = "_")

# Coalesce relevant variables.
tpnw <- within(tpnw, {
	# Clean gender variable.
	female <- ifelse(gender == 95, NA, gender)

	# Transform birthyr variable to age.
	age <- 2019 - birthyr

	# Transform income variable.
	income <- car::recode(income, "95 = NA")
	
	# Combine pid and pid_forc variables.
	pid3 <- ifelse(pid3 == 0, pid_forc, pid3)
	
	# Recode ideology variable.
	ideo <- car::recode(ideo, "3 = NA")

	# Recode education variable.
	educ <- car::recode(educ, "95 = NA")

	# Recode state variable.
	state <- recode(state, "1 = 'Alabama';
                          2 = 'Alaska';
                          4 = 'Arizona';
                          5 = 'Arkansas';
                          6 = 'California';
                          8 = 'Colorado';
                          9 = 'Connecticut';
                          10 = 'Delaware';
                          11 = 'Washington DC';
                          12 = 'Florida';
                          13 = 'Georgia';
                          15 = 'Hawaii';
                          16 = 'Idaho';
                          17 = 'Illinois';
                          18 = 'Indiana';
                          19 = 'Iowa';
                          20 = 'Kansas';
                          21 = 'Kentucky';
                          22 = 'Louisiana';
                          23 = 'Maine';
                          24 = 'Maryland';
                          25 = 'Massachusetts';
                          26 = 'Michigan';
                          27 = 'Minnesota';
                          28 = 'Mississippi';
                          29 = 'Missouri';
                          30 = 'Montana';
                          31 = 'Nebraska';
                          32 = 'Nevada';
                          33 = 'New Hampshire';
                          34 = 'New Jersey';
                          35 = 'New Mexico';
                          36 = 'New York';
                          37 = 'North Carolina';
                          38 = 'North Dakota';
                          39 = 'Ohio';
                          40 = 'Oklahoma';
                          41 = 'Oregon';
                          42 = 'Pennsylvania';
                          44 = 'Rhode Island';
                          45 = 'South Carolina';
                          46 = 'South Dakota';
                          47 = 'Tennessee';
                          48 = 'Texas';
                          49 = 'Utah';
                          50 = 'Vermont';
                          51 = 'Virginia';
                          53 = 'Washington';
                          54 = 'West Virginia';
                          55 = 'Wisconsin';
                          56 = 'Wyoming'")

  # Create regional indicators.
  northeast <- state %in% c("Connecticut", "Maine", "Massachusetts", 
                            "New Hampshire", "Rhode Island", "Vermont", 
                            "New Jersey", "New York", "Pennsylvania")
  midwest <- state %in% c("Illinois", "Indiana", "Michigan", "Ohio", 
                          "Wisconsin", "Iowa", "Kansas", "Minnesota", 
                          "Missouri", "Nebraska", "North Dakota", 
                          "South Dakota")
  south <- state %in% c("Delaware", "Florida", "Georgia", "Maryland", 
                        "North Carolina", "South Carolina", "Virginia", 
                        "Washington DC", "West Virginia", "Alabama", 
                        "Kentucky", "Mississippi", "Tennessee", "Arkansas", 
                        "Louisiana", "Oklahoma", "Texas")
  west <- state %in% c("Arizona", "Colorado", "Idaho", "Montana", "Nevada", 
                        "New Mexico", "Utah", "Wyoming", "Alaska", 
                        "California", "Hawaii", "Oregon", "Washington")

  # Recode join_tpnw outcome.
  join_tpnw <- car::recode(join_tpnw, "2 = 0")

  # Create indicator variables for each treatment arm.
  control <- treatment == 0
  group_cue <- treatment == 1
  security_cue <- treatment == 2
  norms_cue <- treatment == 3
  institutions_cue <- treatment == 4

  # Recode attitudinal outcomes.
  tpnw_atts_danger <- recode(tpnw_atts_danger, "-2 = 2; -1 = 1; 1 = -1; 2 = -2")
  tpnw_atts_use_unaccept <- recode(tpnw_atts_use_unaccept, "-2 = 2; -1 = 1; 
                                                            1 = -1; 2 = -2")
  tpnw_atts_always_cheat <- recode(tpnw_atts_always_cheat, "-2 = 2; -1 = 1; 
                                                            1 = -1; 2 = -2")
  tpnw_atts_cannot_elim <- recode(tpnw_atts_cannot_elim, "-2 = 2; -1 = 1; 
                                                            1 = -1; 2 = -2")
})

# Use mean imputation for missingness.
# Redefine char_cols object.
char_cols <- which(names(tpnw) %in% c(char_vars, meta, "state", "pid_forc", 
									  "income_old", "gender"))

# Define out_vars object.
out_vars <- which(names(tpnw) %in% c("join_tpnw", "n_nukes", "n_tests") |
				  startsWith(names(tpnw), "tpnw_atts") | 
				  startsWith(names(tpnw), "physical_eff") |
				  startsWith(names(tpnw), "testing_matrix"))

# Mean impute.
tpnw[,-c(char_cols, out_vars)] <- 
      data.frame(apply(tpnw[, -c(char_cols, out_vars)], 2, function (x) {
 								     replace(x, is.na(x), mean(x, na.rm = TRUE))
							 	 }))

### Clean YouGov data.
## Indicate all non-numeric variables.
# Indicate YouGov metadata variables (e.g., start/end time, respondent ID) that
# may contain characters.
yougov_vars <- c("starttime", "endtime")

# Numericize all numeric variables
aware <- data.frame(apply(aware[, -which(names(aware) %in% yougov_vars)], 2, 
             as.numeric), aware[which(names(aware) %in% yougov_vars)])

# Coalesce relevant variables.
aware <- within(aware, {
  # Clean gender variable to an indicator of female gender (renamed below).
  gender <- recode(gender, "8 = NA") - 1

  # Transform birthyr variable to age (renamed below).
  birthyr <- 2020 - birthyr

  # Recode pid3 variable.
  pid3 <- recode(pid3, "1 = -1; 2 = 1; 3 = 0; c(5, 8, 9) = NA")

  # Recode pid7
  pid7 <- recode(pid7, "1 = -3; 2 = -2; 3 = -1; 4 = 0; 5 = 1; 6 = 2; 7 = 3;
              c(8, 98) = NA")

  # Code pid variable from pid7.
  party <- recode(pid7, "c(-3, -2, -1) = -1; c(1, 2, 3) = 1")

  # Recode ideology variable.
  ideo5 <- recode(ideo5, "c(6, 8, 9) = NA") - 3

  # Recode education variable.
  educ <- recode(educ, "c(8, 9) = NA")

  # Recode state variable.
  state <- recode(inputstate, "1 = 'Alabama';
                               2 = 'Alaska';
                               4 = 'Arizona';
                               5 = 'Arkansas';
                               6 = 'California';
                               8 = 'Colorado';
                               9 = 'Connecticut';
                               10 = 'Delaware';
                               11 = 'Washington DC';
                               12 = 'Florida';
                               13 = 'Georgia';
                               15 = 'Hawaii';
                               16 = 'Idaho';
                               17 = 'Illinois';
                               18 = 'Indiana';
                               19 = 'Iowa';
                               20 = 'Kansas';
                               21 = 'Kentucky';
                               22 = 'Louisiana';
                               23 = 'Maine';
                               24 = 'Maryland';
                               25 = 'Massachusetts';
                               26 = 'Michigan';
                               27 = 'Minnesota';
                               28 = 'Mississippi';
                               29 = 'Missouri';
                               30 = 'Montana';
                               31 = 'Nebraska';
                               32 = 'Nevada';
                               33 = 'New Hampshire';
                               34 = 'New Jersey';
                               35 = 'New Mexico';
                               36 = 'New York';
                               37 = 'North Carolina';
                               38 = 'North Dakota';
                               39 = 'Ohio';
                               40 = 'Oklahoma';
                               41 = 'Oregon';
                               42 = 'Pennsylvania';
                               44 = 'Rhode Island';
                               45 = 'South Carolina';
                               46 = 'South Dakota';
                               47 = 'Tennessee';
                               48 = 'Texas';
                               49 = 'Utah';
                               50 = 'Vermont';
                               51 = 'Virginia';
                               53 = 'Washington';
                               54 = 'West Virginia';
                               55 = 'Wisconsin';
                               56 = 'Wyoming'")

  # Define US Census geographic regions.
  northeast <- inputstate %in% c(9, 23, 25, 33, 44, 50, 34, 36, 42)
  midwest <- inputstate %in% c(18, 17, 26, 39, 55, 19, 20, 27, 29, 31, 38, 46)
  south <- inputstate %in% c(10, 11, 12, 13, 24, 37, 45, 51, 
              54, 1, 21, 28, 47, 5, 22, 40, 48)
  west <- inputstate %in% c(4, 8, 16, 35, 30, 49, 32, 56, 2, 6, 15, 41, 53)

  # Recode employment.
  employ <- recode(employ, "c(9, 98, 99) = NA")

  # Recode outcome.
  awareness <- recode(awareness, "8 = NA")

  # Normalize weights.
  weight <- weight / sum(weight)
})

# Rename demographic questions.
aware <- rename(aware, c("gender" = "female", "birthyr" = "age", 
             "faminc_new" = "income", "ideo5" = "ideo"))

## Impute missing values.
# Specify non-covariate numerical variables (other is exempted since over 10% of 
# responses are missing; state is exempted since the variable is categorical).
non_covars <- names(aware)[names(aware) %in% c("caseid", "starttime", "endtime", 
                                               "awareness", "state", "weight")]

# Use mean imputation for missingness in covariates.
aware[, -which(names(aware) %in% non_covars)] <- 
  data.frame(apply(aware[, -which(names(aware) %in% 
               non_covars)], 2, function (x) {
                     replace(x, is.na(x), mean(x, na.rm = TRUE))
                 }))

### Produce weights for TPNW experimental data using anesrake.
## Create unique identifier variable for assigning weights.
tpnw$caseid <- 1:nrow(tpnw)

## Recode relevant covariates for reweighting: coarsen age; recode female; and 
## recode geographic covariates.
# Coarsen age into a categorical variable for age groups.
tpnw$age_wtng <- cut(tpnw$age, c(0, 25, 35, 45, 55, 65, 99))
levels(tpnw$age_wtng) <- c("age1824", "age2534", "age3544",
                      "age4554", "age5564", "age6599")

# Recode female as a factor to account for NA values.
tpnw$female_wtng <- as.factor(tpnw$female)
levels(tpnw$female_wtng) <- c("male", "na", "female")

# Recode northeast as a factor.
tpnw$northeast_wtng <- as.factor(tpnw$northeast)
levels(tpnw$northeast_wtng) <- c("other", "northeast")

# Recode midwest as a factor.
tpnw$midwest_wtng <- as.factor(tpnw$midwest)
levels(tpnw$midwest_wtng) <- c("other", "midwest")

# Recode south as a factor.
tpnw$south_wtng <- as.factor(tpnw$south)
levels(tpnw$south_wtng) <- c("other", "south")

# Recode west as a factor.
tpnw$west_wtng <- as.factor(tpnw$west)
levels(tpnw$west_wtng) <- c("other", "west")

## Specify population targets for balancing (from US Census 2018 data).
# Specify gender proportion targets and assign names to comport with factors.
femaletarg <- c(.508, 0, .492)
names(femaletarg) <- c("female", "na", "male")

# Specify age-group proportion targets and assign names to comport with factors.
agetarg <- c(29363, 44854, 40659, 41537, 41700, 51080)/249193
names(agetarg) <- c("age1824", "age2534", "age3544",
               "age4554", "age5564", "age6599")

# Specify northeast proportion targets and assign names to comport with factors.
northeasttarg <- c(1 - .173, .173)
names(northeasttarg) <- c("other", "northeast")

# Specify midwest proportion targets and assign names to comport with factors.
midwesttarg <- c(1 - .209, .209)
names(midwesttarg) <- c("other", "midwest")

# Specify south proportion targets and assign names to comport with factors.
southtarg <- c(1 - .380, .380)
names(southtarg) <- c("other", "south")

# Specify west proportion targets and assign names to comport with factors.
westtarg <- c(1 - .238, .238)
names(westtarg) <- c("other", "west")

# Create a list of all targets, with names to comport with relevant variables.
targets <- list(femaletarg, agetarg, northeasttarg, 
                midwesttarg, southtarg, westtarg)
names(targets) <- c("female_wtng", "age_wtng", "northeast_wtng", 
                    "midwest_wtng", "south_wtng", "west_wtng")

# Produce anesrake weights.
anesrake_out <- anesrake(targets, tpnw, caseid = tpnw$caseid,
                             verbose = TRUE)

# Append anesrake weights to TPNW experimental data.
tpnw$anesrake_weight <- anesrake_out$weightvec

# Remove variables used for weighting.
tpnw <- tpnw[-grep("wtng$", names(tpnw))]

## Write data.
# Write full experimental dataset.
write.csv(tpnw, "data/tpnw_data.csv")

# write full YouGov dataset.
write.csv(aware, "data/tpnw_aware.csv")
